# match survey addresses to catastro
# -------------------------------------------------------------------- #
# prelims ####
# -------------------------------------------------------------------- #
# clear environment
rm(list = ls())

# libraries
library(haven)
library(rgdal)
library(raster)
library(readxl) 
library(classInt)
library(Hmisc) 
library(tidyverse)
library(units)
library(xlsx)

# directory
dir <- paste0("PATH_TO_MAIN_DIRECTORY_HERE")

# -------------------------------------------------------------------- #
# prepare data ####
# -------------------------------------------------------------------- #

# survey data - list of addresses
SurveyAdd <- read_dta(paste0(dir,"data/survey_netquest.dta")) %>%
  select(c("City","StreetType","StreetName","StreetNum","ZIPCode"))

# clean address string
CharToRemove <- paste(" DE", " DEL", "CALLE DEL ", "CALLE "," DELS ", "C/ ", "CALLE ", 
                      "CARRER"," LA ", "LOS", "^DEL ", "^DELS" , sep="|")

# street name
SurveyAdd$SurvStreetNameClean <- str_replace(SurveyAdd$StreetName, CharToRemove, "")
SurveyAdd$SurvStreetNameClean <- str_replace(SurveyAdd$SurvStreetNameClean, "PASEO", "PS")
SurveyAdd$SurvStreetNameClean <- str_replace(SurveyAdd$SurvStreetNameClean, "PASSAGE|PASAJE|PASSATGE", "PJ")
SurveyAdd$SurvStreetNameClean <- str_replace(SurveyAdd$SurvStreetNameClean, "CALLE|CARRER", "CL")

# new var with address string
SurveyAdd$SurvAdd <- paste(SurveyAdd$StreetType,
                           SurveyAdd$SurvStreetNameClean, 
                           SurveyAdd$StreetNum, 
                           SurveyAdd$ZIPCode, sep = " ") 
names(SurveyAdd)[names(SurveyAdd)=="ZIPCode"] <- "SurvZIP"
names(SurveyAdd)[names(SurveyAdd)=="StreetName"] <- "SurvStreetName"
names(SurveyAdd)[names(SurveyAdd)=="StreetNum"] <- "SurvStreetNum"
names(SurveyAdd)[names(SurveyAdd)=="StreetType"] <- "SurvStreetType"

# barcelona addresses in survey
SurveyAddBCN <- subset(SurveyAdd, as.numeric(SurvZIP)>=8000 & as.numeric(SurvZIP)<9000)

# import catastro data
NumDataBCN <- read_dta(paste0(dir,"data/int/08_900.dta")) %>% 
  select(c("PlotCode","MunName","StreetType","StreetName","StreetNum1",
           "StreetLetter1","StreetNum2","StreetLetter2", "ZIPCode","MunDistrict",
           "XCoord","YCoord")) %>% 
  distinct(PlotCode, StreetName, StreetNum1, .keep_all = TRUE) 
# add clean street name
NumDataBCN$StreetNameClean <- str_replace(NumDataBCN$StreetName, CharToRemove, "")
# add string for address
NumDataBCN$CATAdd <- paste(NumDataBCN$StreetType,
                        NumDataBCN$StreetNameClean, 
                        as.numeric(NumDataBCN$StreetNum1),
                        NumDataBCN$ZIPCode, sep = " ") 

# -------------------------------------------------------------------- #
# match to an address ####
# -------------------------------------------------------------------- #

# initialize matrix where matches will be stored
MatchAdd <- NULL

# names of datasets containing data
CATDATA <- get("NumDataBCN")
SURDATA <- get("SurveyAddBCN")

# first match within ZIP Code - these should be exact
ZIPMatch <- merge(x=SURDATA, y=CATDATA,by.x=c("SurvZIP"), by.y=c("ZIPCode"), all.x=TRUE, all.y=FALSE)
ZIPList <- unique(ZIPMatch$SurvZIP, incomparables = FALSE)

for(ZIP in ZIPList){

    # addresses in this ZIP Code in Survey
    SurAddZIP <- SURDATA$SurvAdd[SURDATA$SurvZIP==ZIP]
    # addresses in this ZIP Code in catastro data
    CATAddZIP <- subset(ZIPMatch, ZIPMatch$SurvZIP==ZIP) %>%
      select(c("CATAdd","PlotCode"))
    CATAddresses <- CATAddZIP$CATAdd

    # calculate Levenshtein distance
    dist.name<-adist(SurAddZIP, CATAddresses, partial = TRUE, ignore.case = TRUE)

    # take the pairs with the minimum distance
    min.name<-apply(dist.name, 1, min)
    
    # create dataset with best matches
    MatchAddZIP <-NULL
    for(i in 1:nrow(dist.name)){
      s2.i<-match(min.name[i],dist.name[i,]) # retrieve position of matched element in catastro
      s1.i<-i # position of matched element in survey
      MatchAddZIP <- rbind(data.frame(CATRow=s2.i,SurRow=s1.i, 
                                   SurAdd=SurAddZIP[s1.i], 
                                   CATMatch=CATAddZIP[s2.i,1],
                                   PlotCode=CATAddZIP[s2.i,2],
                                   adist=min.name[i]),MatchAddZIP)
    }
    MatchAdd <- rbind(MatchAdd, MatchAddZIP)
  
} 

MatchAdd$PlotCode <- as.character(MatchAdd$PlotCode)
MatchAdd$SurAdd <- as.character(MatchAdd$SurAdd)
MatchAdd$CATMatch <- as.character(MatchAdd$CATMatch)

# final dataset
Match <- merge(x = SurveyAdd, y = MatchAdd, by.x = c("SurvAdd"), by.y = c("SurAdd")) %>%
  select(-c("CATRow","SurRow"))
# store in excel
write.xlsx(Match, paste0(dir, "data/int/survey_netquest_match.xlsx"))

# -------------------------------------------------------------------- #
# closing ####
# -------------------------------------------------------------------- #

rm(list = ls())
